'''
Functions that retrieve movie titles from the Internet Movie Poster Awards
site.

Created on Jan 6, 2011

@author: matt
'''

import sys
import urllib2
import re

# Root URL of the IMP Awards site; per-year page paths are appended to it.
base_imp_url = "http://www.impawards.com"
# First and last years (both inclusive) fetched when this module is run as
# a script.
begin_year = 1912
end_year = 2011

def fetch_imp_titles_for_year(year):
    """
    Fetches movie titles from the IMP Awards site for the specified year
    and returns a list of the titles for that year.

    year -- the year whose listing page is requested; it is converted with
            str() and inserted into the URL as '<base>/<year>/std.html'.

    Returns the list of strings captured by the title pattern, in the order
    they occur in the page (an empty list when nothing matches).

    Any exception raised while opening the URL or reading the response
    propagates to the caller.
    """

    # build the url string for the specified year
    url_string = base_imp_url + '/' + str(year) + '/std.html'

    # open the connection and guarantee it is closed again, even if the
    # read fails, so that fetching many years in a row does not leak
    # connections
    url_conn = urllib2.urlopen(url_string)
    try:
        page_contents = url_conn.read()
    finally:
        url_conn.close()

    # extract the titles; the pattern is a raw string so the escaped plus
    # sign reaches the regex engine unchanged
    titles = re.findall(r'<td><font size=\+1>(.*?)</font></td>',
                        page_contents)

    return titles

if __name__ == '__main__':
    # Script entry point: fetch the titles for every year from begin_year
    # through end_year (inclusive), sort them, and write them one per line
    # to the file named by the first command-line argument.
    print('-- Movie Title Extractor Utility --')

    # Fail with a usage message instead of a bare IndexError when the
    # output path is missing.
    if len(sys.argv) < 2:
        sys.stderr.write('usage: %s OUTPUT_FILE\n' % sys.argv[0])
        sys.exit(1)

    output_file = sys.argv[1]

    titles = []

    for year in range(begin_year, end_year + 1):
        # single-argument form prints the same text as the print statement
        print('Fetching titles for %s' % year)
        titles.extend(fetch_imp_titles_for_year(year))

    titles.sort()

    # 'with' closes the file even if a write raises part-way through
    with open(output_file, 'w') as out_file:
        for title in titles:
            out_file.write(title + '\n')